DiagrammeR::grViz("digraph {
graph [layout = dot, rankdir = TB]
node [shape = rectangle]
rec1 [label = '1. Introduction']
rec2 [label = '2. Dataset and the Library Used']
rec3 [label = '3. Exploratory Data Analysis']
rec4 [label = '4. Predictive Analysis']
rec5 [label = '5. Recommendations/Conclusions']
# edge definitions with the node IDs
rec1 -> rec2 -> rec3 -> rec4 -> rec5
}",
height = 500)
The 2009 Canadian Internet Use Survey dataset was taken from the Stats Canada website, and it is about the non-use of the internet by individual residents of Canada 15 years of age and over, living in the provinces in the year of 2009. It consists of 24 columns and 23179 rows. This dataset is important because it measures barriers to Internet access and use, online services, and using various digital technologies. With this information, the government can adopt policies to improve the conditions of this deficiency and to plan steps to make the internet accessible to everyone.
nonUse <- read.csv('nonUse.csv', header = TRUE)
library("ggplot2")
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library("dplyr")
library("janitor")
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library("gplots")
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
library(caTools)
library(ROCR)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(e1071)
library(caTools)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(corrplot)
## corrplot 0.92 loaded
###3.1 Count of missing values for each column
sapply(nonUse,function(x) sum(is.na(nonUse)))
## PROVINCE REGION G_URBRUR GCAGEGR6 CSEX G_CEDUC G_CSTUD G_CLFSST
## 0 0 0 0 0 0 0 0
## GFAMTYPE G_HHSIZE G_HEDUC G_HSTUD EV_Q01 EV_Q02 NU_Q01 NU_Q02A
## 0 0 0 0 0 0 0 0
## NU_Q02B NU_Q02C NU_Q02D NU_Q02E NU_Q02F NU_Q02I NU_G02K NU_G02
## 0 0 0 0 0 0 0 0
#5 Removing empty Columns and Rows - If any
nonUse<-nonUse %>% remove_empty(whic=c("rows"))
nonUse<-nonUse %>% remove_empty(whic=c("cols"))
###3.2 To check the no of unique values for each column
sapply(nonUse,function(x) length(unique(x)))
## PROVINCE REGION G_URBRUR GCAGEGR6 CSEX G_CEDUC G_CSTUD G_CLFSST
## 10 6 6 6 2 3 2 3
## GFAMTYPE G_HHSIZE G_HEDUC G_HSTUD EV_Q01 EV_Q02 NU_Q01 NU_Q02A
## 4 4 3 2 2 7 4 6
## NU_Q02B NU_Q02C NU_Q02D NU_Q02E NU_Q02F NU_Q02I NU_G02K NU_G02
## 6 6 6 6 6 6 6 4
###3.3 Summary
summary(nonUse)
## PROVINCE REGION G_URBRUR GCAGEGR6 CSEX
## Min. :10.00 Min. :1.00 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:24.00 1st Qu.:2.00 1st Qu.:4.000 1st Qu.:3.00 1st Qu.:1.000
## Median :35.00 Median :3.00 Median :4.000 Median :4.00 Median :2.000
## Mean :34.75 Mean :3.16 Mean :4.131 Mean :3.94 Mean :1.553
## 3rd Qu.:47.00 3rd Qu.:4.00 3rd Qu.:5.000 3rd Qu.:5.00 3rd Qu.:2.000
## Max. :59.00 Max. :6.00 Max. :6.000 Max. :6.00 Max. :2.000
## G_CEDUC G_CSTUD G_CLFSST GFAMTYPE
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000
## Median :2.000 Median :2.000 Median :1.000 Median :2.000
## Mean :1.796 Mean :1.926 Mean :1.785 Mean :2.101
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.000 Max. :2.000 Max. :3.000 Max. :4.000
## G_HHSIZE G_HEDUC G_HSTUD EV_Q01
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000
## Median :2.000 Median :2.000 Median :2.000 Median :1.000
## Mean :2.272 Mean :2.004 Mean :1.824 Mean :1.242
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :4.000 Max. :3.000 Max. :2.000 Max. :2.000
## EV_Q02 NU_Q01 NU_Q02A NU_Q02B
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:6.000 1st Qu.:6.000
## Median :4.000 Median :6.000 Median :6.000 Median :6.000
## Mean :4.221 Mean :4.947 Mean :5.023 Mean :5.001
## 3rd Qu.:4.000 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :8.000 Max. :7.000 Max. :9.000 Max. :9.000
## NU_Q02C NU_Q02D NU_Q02E NU_Q02F NU_Q02I
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1 Min. :1.000
## 1st Qu.:6.000 1st Qu.:6.000 1st Qu.:6.000 1st Qu.:6 1st Qu.:6.000
## Median :6.000 Median :6.000 Median :6.000 Median :6 Median :6.000
## Mean :5.026 Mean :4.992 Mean :5.032 Mean :5 Mean :4.955
## 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6 3rd Qu.:6.000
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9 Max. :9.000
## NU_G02K NU_G02
## Min. :1.000 Min. :1.000
## 1st Qu.:6.000 1st Qu.:6.000
## Median :6.000 Median :6.000
## Mean :5.005 Mean :5.023
## 3rd Qu.:6.000 3rd Qu.:6.000
## Max. :9.000 Max. :9.000
#3.4 Check sum of Null & NA values
sum(is.na(nonUse))
## [1] 0
sum(is.null(nonUse))
## [1] 0
#3.5 Checking the total no of rows and columns
dim(nonUse)
## [1] 23178 24
#3.6 Checking the first and last few rows
head(nonUse)
## PROVINCE REGION G_URBRUR GCAGEGR6 CSEX G_CEDUC G_CSTUD G_CLFSST GFAMTYPE
## 1 35 3 5 3 2 3 2 1 3
## 2 46 4 5 1 2 1 1 2 2
## 3 10 1 5 2 1 2 2 1 2
## 4 35 3 4 5 2 2 2 3 3
## 5 13 1 4 3 1 2 2 1 2
## 6 46 4 4 2 2 1 2 3 1
## G_HHSIZE G_HEDUC G_HSTUD EV_Q01 EV_Q02 NU_Q01 NU_Q02A NU_Q02B NU_Q02C NU_Q02D
## 1 1 3 2 1 2 6 6 6 6 6
## 2 3 2 1 1 4 6 6 6 6 6
## 3 2 3 2 1 4 6 6 6 6 6
## 4 1 2 2 1 3 2 6 6 6 6
## 5 3 2 2 1 2 6 6 6 6 6
## 6 3 1 2 1 4 2 6 6 6 6
## NU_Q02E NU_Q02F NU_Q02I NU_G02K NU_G02
## 1 6 6 6 6 6
## 2 6 6 6 6 6
## 3 6 6 6 6 6
## 4 6 6 6 6 6
## 5 6 6 6 6 6
## 6 6 6 6 6 6
tail(nonUse)
## PROVINCE REGION G_URBRUR GCAGEGR6 CSEX G_CEDUC G_CSTUD G_CLFSST GFAMTYPE
## 23173 24 2 4 1 1 1 1 1 2
## 23174 46 4 5 3 1 1 2 3 3
## 23175 46 4 4 4 2 1 2 1 2
## 23176 10 1 4 2 2 2 1 3 4
## 23177 35 3 4 6 2 1 2 3 3
## 23178 59 6 4 5 2 3 2 3 2
## G_HHSIZE G_HEDUC G_HSTUD EV_Q01 EV_Q02 NU_Q01 NU_Q02A NU_Q02B NU_Q02C
## 23173 2 1 1 1 4 6 6 6 6
## 23174 1 1 2 2 6 2 2 1 2
## 23175 3 1 2 2 6 2 2 2 2
## 23176 2 2 1 1 4 6 6 6 6
## 23177 1 1 2 1 1 6 6 6 6
## 23178 2 3 2 1 3 6 6 6 6
## NU_Q02D NU_Q02E NU_Q02F NU_Q02I NU_G02K NU_G02
## 23173 6 6 6 6 6 6
## 23174 2 2 2 2 2 2
## 23175 2 2 1 1 1 2
## 23176 6 6 6 6 6 6
## 23177 6 6 6 6 6 6
## 23178 6 6 6 6 6 6
nonUse$PROVINCE1 = ifelse(nonUse$PROVINCE == 10, "Newfoundland and Labrador",
ifelse(nonUse$PROVINCE == 11,"Prince Edward Island",
ifelse(nonUse$PROVINCE == 12,"Nova Scotia",
ifelse(nonUse$PROVINCE == 13,"New Brunswick",
ifelse(nonUse$PROVINCE == 24,"Quebec",
ifelse(nonUse$PROVINCE == 35,"Ontario",
ifelse(nonUse$PROVINCE == 46,"Manitoba",
ifelse(nonUse$PROVINCE == 47,"Saskatchewan",
ifelse(nonUse$PROVINCE == 48,"Alberta","British Columbia")))))))))
nonUse$REGION1 = ifelse(nonUse$REGION == 1, "Atlantic Region",
ifelse(nonUse$REGION == 2,"Quebec",
ifelse(nonUse$REGION == 3,"Ontario",
ifelse(nonUse$REGION == 4,"Manitoba/Saskatchewan",
ifelse(nonUse$REGION == 5,"Alberta","British Columbia")))))
nonUse$COMMUNITY = ifelse(nonUse$G_URBRUR == 01, "Montreal",
ifelse(nonUse$G_URBRUR == 02,"Toronto",
ifelse(nonUse$G_URBRUR == 03,"Vancouver",
ifelse(nonUse$G_URBRUR == 04,"Other Urban excluding Prince Edward Island",
ifelse(nonUse$G_URBRUR == 05,"Rural excluding Prince Edward Island","Prince Edward Island")))))
nonUse$AGEGRP = ifelse(nonUse$GCAGEGR6 == 01, "16 to 24",
ifelse(nonUse$GCAGEGR6 == 02,"25 to 34",
ifelse(nonUse$GCAGEGR6 == 03,"35 to 44",
ifelse(nonUse$GCAGEGR6 == 04,"45 to 54",
ifelse(nonUse$GCAGEGR6 == 05,"55 to 64","65 and older")))))
nonUse$GENDER = ifelse(nonUse$CSEX == 1, "MALE", "FEMALE")
nonUse$EDUCATIONLEVEL = ifelse(nonUse$G_CEDUC == 1, "High school or less",
ifelse(nonUse$G_CEDUC == 2, "College or some post-secondary", "University certificate or degree"))
nonUse$ISSTUDENT = ifelse(nonUse$G_CSTUD == 1, "Yes", "No")
nonUse$EMPLOYMENTSTATUS = ifelse(nonUse$G_CLFSST == 1, "Employed",
ifelse(nonUse$G_CLFSST == 2, "Unemployed", "Not in the labour force"))
nonUse$HOUSEHOLDTYPE = ifelse(nonUse$GFAMTYPE == 1, "Single family household with unmarried children under 16",
ifelse(nonUse$GFAMTYPE == 2, "Single family household without unmarried children under 16",
ifelse(nonUse$GFAMTYPE == 3, "One person households", "Multi family households")))
nonUse$HOUSEHOLDSIZE = ifelse(nonUse$G_HHSIZE == 1, "1 person",
ifelse(nonUse$G_HHSIZE == 2, "2 persons",
ifelse(nonUse$G_HHSIZE == 3, "3 persons", "4 or more persons")))
nonUse$HOUSEHOLDEDUCATION = ifelse(nonUse$G_HEDUC == 1, "High school or less",
ifelse(nonUse$G_HEDUC == 2, "College or some post-secondary", "University certificate or degree"))
nonUse$STUDENTINHOUSEHOLD = ifelse(nonUse$G_HSTUD == 1, "Yes", "No")
nonUse$ISINTERNETUSER = ifelse(nonUse$EV_Q01 == 1, "Yes", "No")
nonUse$INTERNETUSEDURATION = ifelse(nonUse$EV_Q02 == 1, "Less than 1 year",
ifelse(nonUse$EV_Q02 == 2,"1 to 2 years",
ifelse(nonUse$EV_Q02 == 3,"2 to 5 years",
ifelse(nonUse$EV_Q02 == 4,"5 or more years",
ifelse(nonUse$EV_Q02 == 6,"Valid skip",
ifelse(nonUse$EV_Q02 == 7,"Don't know", "Refusal"))))))
nonUse$nonUse = ifelse(nonUse$NU_Q01 == 1, "Yes",
ifelse(nonUse$NU_Q01 == 2, "No",
ifelse(nonUse$NU_Q01 == 6, "Valid skip","Don't know")))
nonUse$NU_COST = ifelse(nonUse$NU_Q02A == 1, "Yes",
ifelse(nonUse$NU_Q02A == 2,"No",
ifelse(nonUse$NU_Q02A == 6,"Valid skip",
ifelse(nonUse$NU_Q02A == 7,"Don't know",
ifelse(nonUse$NU_Q02A == 8,"Refusal","Not stated")))))
nonUse$NU_LIMITEDACCESS = ifelse(nonUse$NU_Q02B == 1, "Yes",
ifelse(nonUse$NU_Q02B == 2,"No",
ifelse(nonUse$NU_Q02B == 6,"Valid skip",
ifelse(nonUse$NU_Q02B == 7,"Don't know",
ifelse(nonUse$NU_Q02B == 8,"Refusal","Not stated")))))
nonUse$NU_DIFFICULT = ifelse(nonUse$NU_Q02C == 1, "Yes",
ifelse(nonUse$NU_Q02C == 2,"No",
ifelse(nonUse$NU_Q02C == 6,"Valid skip",
ifelse(nonUse$NU_Q02C == 7,"Don't know",
ifelse(nonUse$NU_Q02C == 8,"Refusal","Not stated")))))
nonUse$NU_NONEED = ifelse(nonUse$NU_Q02D == 1, "Yes",
ifelse(nonUse$NU_Q02D == 2,"No",
ifelse(nonUse$NU_Q02D == 6,"Valid skip",
ifelse(nonUse$NU_Q02D == 7,"Don't know",
ifelse(nonUse$NU_Q02D == 8,"Refusal","Not stated")))))
nonUse$NU_NOTIME = ifelse(nonUse$NU_Q02E == 1, "Yes",
ifelse(nonUse$NU_Q02E == 2,"No",
ifelse(nonUse$NU_Q02E == 6,"Valid skip",
ifelse(nonUse$NU_Q02E == 7,"Don't know",
ifelse(nonUse$NU_Q02E == 8,"Refusal","Not stated")))))
nonUse$NU_LACKOFSKILLS = ifelse(nonUse$NU_Q02F == 1, "Yes",
ifelse(nonUse$NU_Q02F == 2,"No",
ifelse(nonUse$NU_Q02F == 6,"Valid skip",
ifelse(nonUse$NU_Q02F == 7,"Don't know",
ifelse(nonUse$NU_Q02F == 8,"Refusal","Not stated")))))
nonUse$NU_NOINTEREST = ifelse(nonUse$NU_Q02I == 1, "Yes",
ifelse(nonUse$NU_Q02I == 2,"No",
ifelse(nonUse$NU_Q02I == 6,"Valid skip",
ifelse(nonUse$NU_Q02I == 7,"Don't know",
ifelse(nonUse$NU_Q02I == 8,"Refusal","Not stated")))))
nonUse$NU_AGEREASONS = ifelse(nonUse$NU_G02K == 1, "Yes",
ifelse(nonUse$NU_G02K == 2,"No",
ifelse(nonUse$NU_G02K == 6,"Valid skip",
ifelse(nonUse$NU_G02K == 7,"Don't know",
ifelse(nonUse$NU_G02K == 8,"Refusal","Not stated")))))
nonUse$NU_OTHERS = ifelse(nonUse$NU_G02 == 1, "Yes",
ifelse(nonUse$NU_G02 == 2,"No",
ifelse(nonUse$NU_G02 == 6,"Valid skip","Not stated")))
nonUse_new <- nonUse[,25:48]
Gender = nonUse_new %>% select(GENDER)
gender_chart <- as.data.frame(table(Gender$GENDER)) #created a frequency table
ggplot(gender_chart, aes(x = Var1, y = Freq, fill = Var1)) +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
Region = nonUse_new %>% select(REGION1)
gender_chart <- as.data.frame(table(Region$REGION1)) #created a frequency table
ggplot(gender_chart, aes(x = Var1, y = Freq , fill = Var1)) +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
agegrp = nonUse_new %>% select(AGEGRP)
gender_chart <- as.data.frame(table(agegrp$AGEGRP)) #created a frequency table
ggplot(gender_chart, aes(x = Var1, y = Freq, fill = Var1)) +
geom_col() +
geom_text(aes(label = Freq), vjust = 1.5, colour = "white")
province = nonUse_new %>% select(PROVINCE1)
gender_chart <- as.data.frame(table(province$PROVINCE1)) #created a frequency table
ggplot(gender_chart, aes(x = Var1, y = Freq, fill = Var1)) +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
dt <- as.table(as.matrix(table(nonUse_new$PROVINCE1,nonUse_new$GENDER)))
balloonplot(t(dt), main ="Province is the reason for Non use across Gender", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
dt <- as.table(as.matrix(table(nonUse_new$AGEGRP,nonUse_new$GENDER)))
balloonplot(t(dt), main ="AgeGrp is the reason for Non use across Gender", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
dt_cs <- as.table(as.matrix(table(nonUse_new$ISINTERNETUSER,nonUse_new$AGEGRP)))
chisq <- chisq.test(dt_cs)
round(chisq$residuals, 3)
##
## 16 to 24 25 to 34 35 to 44 45 to 54 55 to 64 65 and older
## No -20.550 -24.685 -21.819 -11.044 5.219 56.280
## Yes 11.598 13.931 12.314 6.233 -2.946 -31.763
corrplot(chisq$residuals, is.cor = FALSE)
dt_cs <- as.table(as.matrix(table(nonUse_new$ISINTERNETUSER,nonUse_new$GENDER)))
chisq <- chisq.test(dt_cs)
round(chisq$residuals, 3)
##
## FEMALE MALE
## No 1.651 -1.836
## Yes -0.932 1.036
corrplot(chisq$residuals, is.cor = FALSE)
dt_cs <- as.table(as.matrix(table(nonUse_new$ISINTERNETUSER,nonUse_new$PROVINCE1)))
chisq <- chisq.test(dt_cs)
round(chisq$residuals, 3)
##
## Alberta British Columbia Manitoba New Brunswick Newfoundland and Labrador
## No -5.096 -7.434 2.819 4.767 7.052
## Yes 2.876 4.195 -1.591 -2.690 -3.980
##
## Nova Scotia Ontario Prince Edward Island Quebec Saskatchewan
## No 2.569 -4.499 0.752 4.465 1.966
## Yes -1.450 2.539 -0.424 -2.520 -1.109
corrplot(chisq$residuals, is.cor = FALSE)
dt_cs <- as.table(as.matrix(table(nonUse_new$ISINTERNETUSER,nonUse_new$REGION1)))
chisq <- chisq.test(dt_cs)
round(chisq$residuals, 3)
##
## Alberta Atlantic Region British Columbia Manitoba/Saskatchewan Ontario
## No -5.096 7.710 -7.434 3.411 -4.499
## Yes 2.876 -4.351 4.195 -1.925 2.539
##
## Quebec
## No 4.465
## Yes -2.520
corrplot(chisq$residuals, is.cor = FALSE)
dt_cs <- as.table(as.matrix(table(nonUse_new$ISINTERNETUSER,nonUse_new$EDUCATIONLEVEL)))
chisq <- chisq.test(dt_cs)
round(chisq$residuals, 3)
##
## College or some post-secondary High school or less
## No -15.348 33.457
## Yes 8.662 -18.882
##
## University certificate or degree
## No -25.382
## Yes 14.325
corrplot(chisq$residuals, is.cor = FALSE)
dt_cs <- as.table(as.matrix(table(nonUse_new$ISINTERNETUSER,nonUse_new$EMPLOYMENTSTATUS)))
chisq <- chisq.test(dt_cs)
round(chisq$residuals, 3)
##
## Employed Not in the labour force Unemployed
## No -31.458 41.921 -6.762
## Yes 17.754 -23.658 3.816
corrplot(chisq$residuals, is.cor = FALSE)
dt_cs <- as.table(as.matrix(table(nonUse_new$ISINTERNETUSER,nonUse_new$HOUSEHOLDEDUCATION)))
chisq <- chisq.test(dt_cs)
round(chisq$residuals, 3)
##
## College or some post-secondary High school or less
## No -9.912 41.567
## Yes 5.594 -23.459
##
## University certificate or degree
## No -27.882
## Yes 15.736
corrplot(chisq$residuals, is.cor = FALSE)
dt_cs <- as.table(as.matrix(table(nonUse_new$ISINTERNETUSER,nonUse_new$HOUSEHOLDEDUCATION)))
chisq <- chisq.test(dt_cs)
round(chisq$residuals, 3)
##
## College or some post-secondary High school or less
## No -9.912 41.567
## Yes 5.594 -23.459
##
## University certificate or degree
## No -27.882
## Yes 15.736
corrplot(chisq$residuals, is.cor = FALSE)
dt <- as.table(as.matrix(table(nonUse_new$NU_COST,nonUse_new$GENDER)))
balloonplot(t(dt), main ="Cost is the reason for Non use across Gender", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
cost_gender_yn = nonUse_new %>% select(NU_COST,GENDER) %>%
filter(NU_COST =="Yes" | NU_COST =="No" )
counts_cost_gender_yn = table(cost_gender_yn$NU_COST,cost_gender_yn$GENDER)
# Plotting the bar chart for bivariate analysis
cost_gender_chart <- as.data.frame(counts_cost_gender_yn) #created a frequency table
ggplot(cost_gender_chart, aes(x = Var2, y = Freq, fill = Var1)) + labs(title = "Is Cost the reason, eople don't use Internet?", subtitle = "Reply of No and yes based on Gender?", caption = "R Project") + xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
dt <- as.table(as.matrix(table(nonUse_new$NU_NOINTEREST,nonUse_new$GENDER)))
balloonplot(t(dt), main ="No Interest is the reason for Non use across Gender", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
interest_gender_yn = nonUse_new %>% select(NU_NOINTEREST,GENDER) %>%
filter(NU_NOINTEREST =="Yes" | NU_NOINTEREST =="No" )
counts_interest_gender_yn = table(interest_gender_yn$NU_NOINTEREST,interest_gender_yn$GENDER)
interest_gender_chart <- as.data.frame(counts_interest_gender_yn) #created a frequency table
ggplot(interest_gender_chart, aes(x = Var2, y = Freq, fill = Var1)) + labs(title = "Is No Interest the reason, people don't use Internet?", subtitle = "Reply of No and yes based on Gender?", caption = "R Project") + xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
dt <- as.table(as.matrix(table(nonUse_new$NU_LIMITEDACCESS,nonUse_new$GENDER)))
balloonplot(t(dt), main ="Limited Access is the reason for Non use across Gender", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
access_gender_yn = nonUse_new %>% select(NU_LIMITEDACCESS,GENDER) %>%
filter(NU_LIMITEDACCESS =="Yes" | NU_LIMITEDACCESS =="No" )
counts_access_gender_yn = table(access_gender_yn$NU_LIMITEDACCESS,access_gender_yn$GENDER)
access_gender_chart <- as.data.frame(counts_access_gender_yn) #created a frequency table
ggplot(access_gender_chart, aes(x = Var2, y = Freq, fill = Var1)) + labs(title = "Is Limited Access the reason, people don't use Internet?", subtitle = "Reply of No and yes based on Gender?", caption = "R Project") + xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
dt <- as.table(as.matrix(table(nonUse_new$NU_DIFFICULT,nonUse_new$GENDER)))
balloonplot(t(dt), main ="Difficulty is the reason for Non use across Gender", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
difficulty_gender_yn = nonUse_new %>% select(NU_DIFFICULT,GENDER) %>%
filter(NU_DIFFICULT =="Yes" | NU_DIFFICULT =="No" )
counts_difficulty_gender_yn = table(difficulty_gender_yn$NU_DIFFICULT,difficulty_gender_yn$GENDER)
difficulty_gender_chart <- as.data.frame(counts_difficulty_gender_yn) #created a frequency table
ggplot(difficulty_gender_chart, aes(x = Var2, y = Freq, fill = Var1)) + labs(title = "Is Difficulty the reason, people don't use Internet?", subtitle = "Reply of No and yes based on Gender?", caption = "R Project") + xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
dt <- as.table(as.matrix(table(nonUse_new$NU_NONEED,nonUse_new$GENDER)))
balloonplot(t(dt), main ="No Need is the reason for Non use across Gender", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
need_gender_yn = nonUse_new %>% select(NU_NONEED,GENDER) %>%
filter(NU_NONEED =="Yes" | NU_NONEED =="No" )
counts_need_gender_yn = table(need_gender_yn$NU_NONEED,need_gender_yn$GENDER)
need_gender_chart <- as.data.frame(counts_need_gender_yn) #created a frequency table
ggplot(need_gender_chart, aes(x = Var2, y = Freq, fill = Var1)) + labs(title = "Is No Need the reason, eople don't use Internet?", subtitle = "Reply of No and yes based on Gender?", caption = "R Project") + xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
dt <- as.table(as.matrix(table(nonUse_new$NU_NOTIME,nonUse_new$GENDER)))
balloonplot(t(dt), main ="No Time is the reason for Non use across Gender", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
time_gender_yn = nonUse_new %>% select(NU_NOTIME,GENDER) %>%
filter(NU_NOTIME =="Yes" | NU_NOTIME =="No" )
counts_time_gender_yn = table(time_gender_yn$NU_NOTIME,time_gender_yn$GENDER)
time_gender_chart <- as.data.frame(counts_time_gender_yn) #created a frequency table
ggplot(time_gender_chart, aes(x = Var2, y = Freq, fill = Var1)) + labs(title = "Is No Time the reason, eople don't use Internet?", subtitle = "Reply of No and yes based on Gender?", caption = "R Project") + xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
dt <- as.table(as.matrix(table(nonUse_new$NU_LACKOFSKILLS,nonUse_new$GENDER)))
balloonplot(t(dt), main ="Lack of Skill is the reason for Non use across Gender", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
skills_gender_yn = nonUse_new %>% select(NU_LACKOFSKILLS,GENDER) %>%
filter(NU_LACKOFSKILLS =="Yes" | NU_LACKOFSKILLS =="No" )
counts_skills_gender_yn = table(skills_gender_yn$NU_LACKOFSKILLS,skills_gender_yn$GENDER)
skills_gender_chart <- as.data.frame(counts_skills_gender_yn) #created a frequency table
ggplot(skills_gender_chart, aes(x = Var2, y = Freq, fill = Var1)) + labs(title = "Is Lack of Skills the reason, eople don't use Internet?", subtitle = "Reply of No and yes based on Gender?", caption = "R Project") + xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
dt <- as.table(as.matrix(table(nonUse_new$NU_AGEREASONS,nonUse_new$GENDER)))
balloonplot(t(dt), main ="Age is the reason for Non use across Gender", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
age_gender_yn = nonUse_new %>% select(NU_AGEREASONS,GENDER) %>%
filter(NU_AGEREASONS =="Yes" | NU_AGEREASONS =="No" )
counts_age_gender_yn = table(age_gender_yn$NU_AGEREASONS,age_gender_yn$GENDER)
age_gender_chart <- as.data.frame(counts_age_gender_yn) #created a frequency table
ggplot(age_gender_chart, aes(x = Var2, y = Freq, fill = Var1)) + labs(title = "Is Age Reasons the reason, eople don't use Internet?", subtitle = "Reply of No and yes based on Gender?", caption = "R Project") + xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
cost_yn = nonUse_new %>% select(NU_COST) %>%
filter(NU_COST =="Yes" | NU_COST =="No" )
cost_chart <- as.data.frame(table(cost_yn$NU_COST))
cost_chart$Choice_of_Reasons = cost_chart$Var1
cost_chart$Var1 = NULL
#created a frequency table
ggplot(cost_chart, aes(x = Choice_of_Reasons, y = Freq, fill= Choice_of_Reasons)) + labs(title = "Why People don't use Internet?", subtitle = "Is Cost the major reason?", caption = "R Project") +xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 3.5, colour = "white")
interest_yn = nonUse_new %>% select(NU_NOINTEREST) %>%
filter(NU_NOINTEREST =="Yes" | NU_NOINTEREST =="No" )
interest_chart <- as.data.frame(table(interest_yn$NU_NOINTEREST))
interest_chart$Choice_of_Reasons = interest_chart$Var1
interest_chart$Var1 = NULL
#Create a frequency table
ggplot(interest_chart, aes(x = Choice_of_Reasons, y = Freq, fill= Choice_of_Reasons)) + labs(title = "Why People don't use Internet?", subtitle = "Is Lack of Interest the major reason?", caption = "R Project") +xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 1.5, colour = "white")
access_yn = nonUse_new %>% select(NU_LIMITEDACCESS) %>%
filter(NU_LIMITEDACCESS =="Yes" | NU_LIMITEDACCESS =="No" )
access_chart <- as.data.frame(table(access_yn$NU_LIMITEDACCESS))
access_chart$Choice_of_Reasons = access_chart$Var1
access_chart$Var1 = NULL
#Create a frequency table
ggplot(access_chart, aes(x = Choice_of_Reasons, y = Freq, fill= Choice_of_Reasons)) + labs(title = "Why People don't use Internet?", subtitle = "Is Limited Access the major reason?", caption = "R Project") +xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 1.5, colour = "white")
difficulty_yn = nonUse_new %>% select(NU_DIFFICULT) %>%
filter(NU_DIFFICULT =="Yes" | NU_DIFFICULT =="No" )
difficulty_chart <- as.data.frame(table(difficulty_yn$NU_DIFFICULT))
difficulty_chart$Choice_of_Reasons = difficulty_chart$Var1
difficulty_chart$Var1 = NULL
#Create a frequency table
ggplot(difficulty_chart, aes(x = Choice_of_Reasons, y = Freq, fill= Choice_of_Reasons)) + labs(title = "Why People don't use Internet?", subtitle = "Is Difficulty the major reason?", caption = "R Project") +xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 1.5, colour = "white")
need_yn = nonUse_new %>% select(NU_NONEED) %>%
filter(NU_NONEED =="Yes" | NU_NONEED =="No" )
need_chart <- as.data.frame(table(need_yn$NU_NONEED))
need_chart$Choice_of_Reasons = need_chart$Var1
need_chart$Var1 = NULL
#Create a frequency table
ggplot(need_chart, aes(x = Choice_of_Reasons, y = Freq, fill= Choice_of_Reasons)) + labs(title = "Why People don't use Internet?", subtitle = "Is Lack of need the major reason?", caption = "R Project") +xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 1.5, colour = "white")
time_yn = nonUse_new %>% select(NU_NOTIME) %>%
filter(NU_NOTIME =="Yes" | NU_NOTIME =="No" )
time_chart <- as.data.frame(table(time_yn$NU_NOTIME))
time_chart$Choice_of_Reasons = time_chart$Var1
time_chart$Var1 = NULL
#Created a frequency table
ggplot(time_chart, aes(x = Choice_of_Reasons, y = Freq, fill= Choice_of_Reasons)) + labs(title = "Why People don't use Internet?", subtitle = "Is Lack of time the major reason?", caption = "R Project") +xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 1.5, colour = "white")
skills_yn = nonUse_new %>% select(NU_LACKOFSKILLS) %>%
filter(NU_LACKOFSKILLS =="Yes" | NU_LACKOFSKILLS =="No" )
skills_chart <- as.data.frame(table(skills_yn$NU_LACKOFSKILLS))
skills_chart$Choice_of_Reasons = skills_chart$Var1
skills_chart$Var1 = NULL
#created a frequency table
ggplot(skills_chart, aes(x = Choice_of_Reasons, y = Freq, fill= Choice_of_Reasons)) + labs(title = "Why People don't use Internet?", subtitle = "Is Lack of skills the major reason?", caption = "R Project") +xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 1.5, colour = "white")
age_yn = nonUse_new %>% select(NU_AGEREASONS) %>%
filter(NU_AGEREASONS =="Yes" | NU_AGEREASONS =="No" )
age_chart <- as.data.frame(table(age_yn$NU_AGEREASONS))
age_chart$Choice_of_Reasons = age_chart$Var1
age_chart$Var1 = NULL
#Created a frequency table
ggplot(age_chart, aes(x = Choice_of_Reasons, y = Freq, fill= Choice_of_Reasons)) + labs(title = "Why People don't use Internet?", subtitle = "Is Age the major reason?", caption = "R Project") +xlab("Choice of Reason") +ylab("Frequency of Answers") +
geom_col() +
geom_text(aes(label = Freq), vjust = 1.5, colour = "white")
#Pie Chart showing proportion of people based on education level
eol_table <- table(nonUse_new$EDUCATIONLEVEL) #table of counts
proportion_table <- prop.table(eol_table)#table of proportion
piepercent<- round(100*eol_table/sum(eol_table), 1)
piepercent
##
## College or some post-secondary High school or less
## 42.1 39.2
## University certificate or degree
## 18.7
pie(proportion_table,labels =piepercent, main = "Proportion of people based on education level", col = rainbow(length(proportion_table)))
legend("topright", c("College or some post-secondary","High school or less","University certificate or degree"), cex = 0.7,fill = rainbow(length(proportion_table)))
#Pie Chart showing proportion of years of usage
you_table <- table(nonUse_new$INTERNETUSEDURATION) #table of counts
proportion_table <- prop.table(you_table)#table of proportion
piepercent<- round(100*you_table/sum(you_table), 1)
piepercent
##
## 1 to 2 years 2 to 5 years 5 or more years Don't know
## 3.8 11.2 58.1 0.1
## Less than 1 year Refusal Valid skip
## 2.6 0.0 24.2
pie(proportion_table,labels =piepercent, main = "Proportion of years of Internet usage", col = rainbow(length(proportion_table)))
legend("topright", c("1 to 2 years","2 to 5 years","5 or more years","Don't know","Less than 1 year","Refusal","Valid skip"), cex = 0.7,fill = rainbow(length(proportion_table)))
data = nonUse_new
cols = names(data)
data[cols] <- lapply(data[cols] , factor)
# Splitting dataset
split <- sample.split(data, SplitRatio = 0.8)
split
## [1] FALSE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
train_reg <- subset(data, split == "TRUE")
test_reg <- subset(data, split == "FALSE")
# Training model
logistic_model <- glm(ISINTERNETUSER ~ PROVINCE1 + EDUCATIONLEVEL + GENDER + HOUSEHOLDEDUCATION + EMPLOYMENTSTATUS + HOUSEHOLDTYPE ,
data = train_reg,
family = "binomial")
# Summary
summary(logistic_model)
##
## Call:
## glm(formula = ISINTERNETUSER ~ PROVINCE1 + EDUCATIONLEVEL + GENDER +
## HOUSEHOLDEDUCATION + EMPLOYMENTSTATUS + HOUSEHOLDTYPE, family = "binomial",
## data = train_reg)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1964 0.1203 0.3380 0.6032 1.9712
##
## Coefficients:
## Estimate
## (Intercept) 2.76685
## PROVINCE1British Columbia 0.24495
## PROVINCE1Manitoba -0.25498
## PROVINCE1New Brunswick -0.38722
## PROVINCE1Newfoundland and Labrador -0.71228
## PROVINCE1Nova Scotia -0.32365
## PROVINCE1Ontario 0.06715
## PROVINCE1Prince Edward Island -0.27067
## PROVINCE1Quebec -0.25440
## PROVINCE1Saskatchewan -0.30021
## EDUCATIONLEVELHigh school or less -0.58006
## EDUCATIONLEVELUniversity certificate or degree 0.58315
## GENDERMALE -0.19527
## HOUSEHOLDEDUCATIONHigh school or less -0.63372
## HOUSEHOLDEDUCATIONUniversity certificate or degree 0.71417
## EMPLOYMENTSTATUSNot in the labour force -1.57880
## EMPLOYMENTSTATUSUnemployed -0.01355
## HOUSEHOLDTYPEOne person households -0.85482
## HOUSEHOLDTYPESingle family household with unmarried children under 16 0.79333
## HOUSEHOLDTYPESingle family household without unmarried children under 16 -0.39813
## Std. Error
## (Intercept) 0.14119
## PROVINCE1British Columbia 0.09790
## PROVINCE1Manitoba 0.09694
## PROVINCE1New Brunswick 0.11301
## PROVINCE1Newfoundland and Labrador 0.11957
## PROVINCE1Nova Scotia 0.10917
## PROVINCE1Ontario 0.08118
## PROVINCE1Prince Edward Island 0.14488
## PROVINCE1Quebec 0.08404
## PROVINCE1Saskatchewan 0.10289
## EDUCATIONLEVELHigh school or less 0.06484
## EDUCATIONLEVELUniversity certificate or degree 0.12723
## GENDERMALE 0.04200
## HOUSEHOLDEDUCATIONHigh school or less 0.06647
## HOUSEHOLDEDUCATIONUniversity certificate or degree 0.10041
## EMPLOYMENTSTATUSNot in the labour force 0.04352
## EMPLOYMENTSTATUSUnemployed 0.11257
## HOUSEHOLDTYPEOne person households 0.12631
## HOUSEHOLDTYPESingle family household with unmarried children under 16 0.13609
## HOUSEHOLDTYPESingle family household without unmarried children under 16 0.12350
## z value
## (Intercept) 19.597
## PROVINCE1British Columbia 2.502
## PROVINCE1Manitoba -2.630
## PROVINCE1New Brunswick -3.426
## PROVINCE1Newfoundland and Labrador -5.957
## PROVINCE1Nova Scotia -2.965
## PROVINCE1Ontario 0.827
## PROVINCE1Prince Edward Island -1.868
## PROVINCE1Quebec -3.027
## PROVINCE1Saskatchewan -2.918
## EDUCATIONLEVELHigh school or less -8.947
## EDUCATIONLEVELUniversity certificate or degree 4.584
## GENDERMALE -4.649
## HOUSEHOLDEDUCATIONHigh school or less -9.533
## HOUSEHOLDEDUCATIONUniversity certificate or degree 7.112
## EMPLOYMENTSTATUSNot in the labour force -36.280
## EMPLOYMENTSTATUSUnemployed -0.120
## HOUSEHOLDTYPEOne person households -6.768
## HOUSEHOLDTYPESingle family household with unmarried children under 16 5.829
## HOUSEHOLDTYPESingle family household without unmarried children under 16 -3.224
## Pr(>|z|)
## (Intercept) < 2e-16
## PROVINCE1British Columbia 0.012350
## PROVINCE1Manitoba 0.008532
## PROVINCE1New Brunswick 0.000612
## PROVINCE1Newfoundland and Labrador 2.57e-09
## PROVINCE1Nova Scotia 0.003031
## PROVINCE1Ontario 0.408150
## PROVINCE1Prince Edward Island 0.061740
## PROVINCE1Quebec 0.002469
## PROVINCE1Saskatchewan 0.003524
## EDUCATIONLEVELHigh school or less < 2e-16
## EDUCATIONLEVELUniversity certificate or degree 4.57e-06
## GENDERMALE 3.33e-06
## HOUSEHOLDEDUCATIONHigh school or less < 2e-16
## HOUSEHOLDEDUCATIONUniversity certificate or degree 1.14e-12
## EMPLOYMENTSTATUSNot in the labour force < 2e-16
## EMPLOYMENTSTATUSUnemployed 0.904193
## HOUSEHOLDTYPEOne person households 1.31e-11
## HOUSEHOLDTYPESingle family household with unmarried children under 16 5.57e-09
## HOUSEHOLDTYPESingle family household without unmarried children under 16 0.001266
##
## (Intercept) ***
## PROVINCE1British Columbia *
## PROVINCE1Manitoba **
## PROVINCE1New Brunswick ***
## PROVINCE1Newfoundland and Labrador ***
## PROVINCE1Nova Scotia **
## PROVINCE1Ontario
## PROVINCE1Prince Edward Island .
## PROVINCE1Quebec **
## PROVINCE1Saskatchewan **
## EDUCATIONLEVELHigh school or less ***
## EDUCATIONLEVELUniversity certificate or degree ***
## GENDERMALE ***
## HOUSEHOLDEDUCATIONHigh school or less ***
## HOUSEHOLDEDUCATIONUniversity certificate or degree ***
## EMPLOYMENTSTATUSNot in the labour force ***
## EMPLOYMENTSTATUSUnemployed
## HOUSEHOLDTYPEOne person households ***
## HOUSEHOLDTYPESingle family household with unmarried children under 16 ***
## HOUSEHOLDTYPESingle family household without unmarried children under 16 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20368 on 18347 degrees of freedom
## Residual deviance: 14901 on 18328 degrees of freedom
## AIC: 14941
##
## Number of Fisher Scoring iterations: 5
# Predict test data based on model
predict_reg <- predict(logistic_model,
test_reg, type = "response")
# Changing probabilities #ABOVE 0.5=YES
predict_reg <- ifelse(predict_reg >0.5, 1, 0)
# Evaluating model accuracy
# using confusion matrix
table(test_reg$ISINTERNETUSER, predict_reg)
## predict_reg
## 0 1
## No 520 612
## Yes 257 3441
# ROC-AUC Curve
ROCPred <- prediction(predict_reg, test_reg$ISINTERNETUSER)
ROCPer <- performance(ROCPred, measure = "tpr",
x.measure = "fpr")
# Plotting curve
plot(ROCPer)
plot(ROCPer, colorize = TRUE,
print.cutoffs.at = seq(0.1, by = 0.1),
main = "ROC CURVE")
abline(a = 0, b = 1)
auc <- performance(ROCPred, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.6949335
auc <- round(auc, 4)
legend(.6, .4, auc, title = "AUC", cex = 1)
####Logistics Model Analysis: Province, Gender , Household education level, employement status and household type are the significant drivers behind predicting a person is a internet user or not. The AUC = 0.7 which will help us predict future potential internet users in a new sample.
rf <- randomForest(ISINTERNETUSER~ PROVINCE1 + EDUCATIONLEVEL + GENDER + HOUSEHOLDEDUCATION + EMPLOYMENTSTATUS + HOUSEHOLDTYPE + HOUSEHOLDSIZE + COMMUNITY, data=train_reg, NTREE= 500)
(rf)
##
## Call:
## randomForest(formula = ISINTERNETUSER ~ PROVINCE1 + EDUCATIONLEVEL + GENDER + HOUSEHOLDEDUCATION + EMPLOYMENTSTATUS + HOUSEHOLDTYPE + HOUSEHOLDSIZE + COMMUNITY, data = train_reg, NTREE = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 17.95%
## Confusion matrix:
## No Yes class.error
## No 2015 2452 0.54891426
## Yes 841 13040 0.06058641
# Importance plot
importance(rf)
## MeanDecreaseGini
## PROVINCE1 156.05098
## EDUCATIONLEVEL 320.21274
## GENDER 40.11033
## HOUSEHOLDEDUCATION 479.31773
## EMPLOYMENTSTATUS 739.87525
## HOUSEHOLDTYPE 205.59283
## HOUSEHOLDSIZE 249.90653
## COMMUNITY 87.31853
# Variable importance plot
varImpPlot(rf)
classifier_cl = naiveBayes(ISINTERNETUSER~ PROVINCE1 + EDUCATIONLEVEL + GENDER + HOUSEHOLDEDUCATION + EMPLOYMENTSTATUS + HOUSEHOLDTYPE + HOUSEHOLDSIZE, data=train_reg)
# Predicting on test data'
y_pred <- predict(classifier_cl, newdata = test_reg)
# Confusion Matrix
cm <- table(test_reg$ISINTERNETUSER, y_pred)
cm
## y_pred
## No Yes
## No 731 401
## Yes 558 3140
# Model Evaluation
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## y_pred
## No Yes
## No 731 401
## Yes 558 3140
##
## Accuracy : 0.8014
## 95% CI : (0.7899, 0.8126)
## No Information Rate : 0.7331
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4721
##
## Mcnemar's Test P-Value : 4.716e-07
##
## Sensitivity : 0.5671
## Specificity : 0.8868
## Pos Pred Value : 0.6458
## Neg Pred Value : 0.8491
## Prevalence : 0.2669
## Detection Rate : 0.1513
## Detection Prevalence : 0.2344
## Balanced Accuracy : 0.7269
##
## 'Positive' Class : No
##